#Course: BUAN 5210
#Purpose: Analyze effectiveness of Advertisement and Promotion, and comparison with competitors
#Date: 11/02/2019
#Author: Akira Nakagawa, Anh Doan

Preparing libraries and data for analysis

# -- Clear environment of variables and functions------------------
rm(list = ls(all = TRUE)) 

# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)

# -- Load packages ---------------------------------------
#install.packages("tidyverse") 
library(tidyverse)
# The gridExtra package contains grid.arrange function used to combine plots in the same window
library(gridExtra)
# The janitor package contains tidyverse functions for cross-tables
library(janitor)
# The knitr package contains some table formating functions
library(knitr)
# The GGally package contains a custom correlation plot we will use
library(GGally)
library(MultinomialCI)
library(htmlTable)
library(Hmisc)
library(formattable)
library(rms)
library(MultNonParam)


# -- Load data from mtp_data.csv -----------------------------------
mtp <- read.csv("mtp_data.csv")

# -- data cleansing  -----------------------------------
# Convert into factor variables
mtp$promo <- factor(mtp$promo)
mtp$iri_key <- factor(mtp$iri_key)

# Create a column for Company based on the column "brand"
Kell <- c("KELLOGGS COCOA KRISPIES", "KELLOGGS FROOT LOOPS", "KELLOGGS FROSTED MINI WHEATS", "KELLOGGS RAISIN BRAN", "KELLOGGS RICE KRISPIES", "KELLOGGS SMART START", "KELLOGGS SPECIAL K")
GM <- c("GENERAL MILLS CHEERIOS", "GENERAL MILLS CINNAMON TST CR", "GENERAL MILLS COCOA PUFFS", "GENERAL MILLS KIX", "GENERAL MILLS LUCKY CHARMS")
Po <- c("POST GRAPE NUTS", "POST SHREDDED WHEAT")
mtp <- transform(mtp, company= if_else( mtp$brand %in% Kell, "Kelloggs", if_else(mtp$brand %in% GM, "GM", "Post")))

Base EDA Step 1: Uni-variate non-graphical EDA

# to look at the data 
head(mtp)
##                 UPC iri_key week units                         brand
## 1 00-01-16000-11653  644347 1484     5 GENERAL MILLS CINNAMON TST CR
## 2 00-01-16000-11653  248741 1483     2 GENERAL MILLS CINNAMON TST CR
## 3 00-01-16000-11653  535806 1489     3 GENERAL MILLS CINNAMON TST CR
## 4 00-01-16000-11945  675634 1489     2        GENERAL MILLS CHEERIOS
## 5 00-01-16000-11945  205272 1491     8        GENERAL MILLS CHEERIOS
## 6 00-01-16000-11945  248741 1492     5        GENERAL MILLS CHEERIOS
##           flavor package volume price promo   ad company
## 1 CINNAMON TOAST     BOX   0.06   0.5     0    A      GM
## 2 CINNAMON TOAST     BOX   0.06   0.5     0 NONE      GM
## 3 CINNAMON TOAST     BOX   0.06   0.5     0 NONE      GM
## 4        TOASTED     BOX   0.04   0.5     0 NONE      GM
## 5        TOASTED     BOX   0.04   0.5     0 NONE      GM
## 6        TOASTED     BOX   0.04   0.5     0 NONE      GM

Comments

  • In the price (per package), we can see only “0.5” in the first 6 rows. Need to investigate in further EDA steps.
  • UPC is told to be a unique variable but it has duplications.
# to see how many observations, variables, types etc
str(mtp)
## 'data.frame':    21850 obs. of  12 variables:
##  $ UPC    : Factor w/ 114 levels "00-01-16000-11653",..: 1 1 1 2 2 2 3 3 3 3 ...
##  $ iri_key: Factor w/ 1420 levels "200171","200197",..: 1041 446 1018 1217 48 446 1295 794 1184 1043 ...
##  $ week   : int  1484 1483 1489 1489 1491 1492 1517 1513 1523 1483 ...
##  $ units  : int  5 2 3 2 8 5 6 1 4 14 ...
##  $ brand  : Factor w/ 15 levels "GENERAL MILLS CHEERIOS",..: 2 2 2 1 1 1 2 2 2 2 ...
##  $ flavor : Factor w/ 5 levels "CINNAMON TOAST",..: 1 1 1 5 5 5 1 1 1 1 ...
##  $ package: Factor w/ 2 levels "BOX","CUP": 1 1 1 1 1 1 2 2 2 2 ...
##  $ volume : num  0.06 0.06 0.06 0.04 0.04 0.04 0.12 0.12 0.12 0.12 ...
##  $ price  : num  0.5 0.5 0.5 0.5 0.5 0.5 1.09 1.59 1.59 1 ...
##  $ promo  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ad     : Factor w/ 3 levels "A","B","NONE": 1 3 3 3 3 3 3 3 3 3 ...
##  $ company: Factor w/ 3 levels "GM","Kelloggs",..: 1 1 1 1 1 1 1 1 1 1 ...

Comments

  • 5 Factor variables + promo(int)
    • Promo has to be factor based on the variable descriptions. We may need to be careful with this variable
    • UPC has 114 levels, which may be hard to show in visualization
    • Package only has 2 possible states: Box or Cup
  • 4 continuous integer variables
  • Assume iri-key and UPC are like a composite primary key
summary(mtp)
##                 UPC           iri_key           week          units       
##  00-01-43000-10521:  676   656444 :   35   Min.   :1479   Min.   : 1.000  
##  00-01-38000-01621:  666   256951 :   31   1st Qu.:1492   1st Qu.: 3.000  
##  00-01-38000-00828:  660   259661 :   31   Median :1505   Median : 7.000  
##  00-01-16000-27569:  639   267403 :   31   Mean   :1505   Mean   : 8.579  
##  00-02-38000-66330:  618   652139 :   31   3rd Qu.:1518   3rd Qu.:12.000  
##  00-01-38000-01611:  612   681735 :   31   Max.   :1530   Max.   :28.000  
##  (Other)          :17979   (Other):21660                                  
##                            brand                  flavor     package    
##  KELLOGGS FROSTED FLAKES      : 2295   CINNAMON TOAST:1834   BOX:21306  
##  KELLOGGS FROOT LOOPS         : 2192   COCOA         :1901   CUP:  544  
##  GENERAL MILLS CINNAMON TST CR: 1834   FRUIT         :2192              
##  GENERAL MILLS LUCKY CHARMS   : 1681   REGULAR       :8816              
##  KELLOGGS FROSTED MINI WHEATS : 1574   TOASTED       :7107              
##  GENERAL MILLS CHEERIOS       : 1458                                    
##  (Other)                      :10816                                    
##      volume          price       promo        ad            company    
##  Min.   :0.040   Min.   :0.250   0:17305   A   : 1456   GM      :7189  
##  1st Qu.:0.750   1st Qu.:3.190   1: 4545   B   : 1061   Kelloggs:9888  
##  Median :1.060   Median :3.790             NONE:19333   Post    :4773  
##  Mean   :1.016   Mean   :3.763                                         
##  3rd Qu.:1.120   3rd Qu.:4.390                                         
##  Max.   :4.000   Max.   :9.990                                         
## 

Comments

  • Volume is symmetric. However, the max is 4.000, so we assume there are some outliers
  • Price is symmetric. However, the min is 0.250 so we may have some outliars
  • week is told to be a time series
  • Brand has information about producer and brands together, so it would be difficult to see the relation between the producers
  • Because promo’s 1st Qu, median, and 3rd Qu. are all 0. Thus, at least more than 75% of them do not have promotions
  • Kelloggs sells the most out of three companies (What is the Kelloggs’s strategy?)

Questions

  • Is there significant difference of sales between promo and non-promo cereals?
  • Is certain brand cheaper than the other?
  • Is company/brand name an important factor for more sales?
  • Is there significant difference between none-ad and with advertisement

Uni-variate graphical

Categorical

grid.arrange(
# company
ggplot(data = mtp, mapping = aes(x = company)) +
  geom_bar(),
# brand
ggplot(data = mtp, mapping = aes(x = brand)) +
  geom_bar(),
# flavor
ggplot(data = mtp, mapping = aes(x = flavor)) +
  geom_bar(),
# package
ggplot(data = mtp, mapping = aes(x = package)) +
  geom_bar(),
# promo
ggplot(data = mtp, mapping = aes(x = promo)) +
  geom_bar(),
# ad
ggplot(data = mtp, mapping = aes(x = ad)) +
  geom_bar(),
ncol = 1 )

Comments

  • Most are Regular and Toasted (flavor)
  • Most cereal package have no advertisement
  • Company, Post, can be seen completing 2nd place in the market after GM

Quantative

Units

grid.arrange(
# Create histogram
ggplot(data = mtp, mapping = aes(x = units)) + 
    geom_histogram(),

# Add boxplot
ggplot(data = mtp, mapping = aes(x = 1)) + 
    geom_boxplot(mapping = aes(y = units)) +
    coord_flip(), # use to have same x-axis on both graphs

# Set number of columns in grid.arrange
ncol = 1 )

Comments

  • Skewed
  • Many rows have smaller units/sales

Volume

grid.arrange(
# Create histogram
ggplot(data = mtp, mapping = aes(x = volume)) + 
    geom_histogram(),

# Add boxplot
ggplot(data = mtp, mapping = aes(x = 1)) + 
    geom_boxplot(mapping = aes(y = volume)) +
    coord_flip(), # use to have same x-axis on both graphs

# Set number of columns in grid.arrange
ncol = 1 )

Comments

  • Skewed

Price

grid.arrange(
# Create histogram
ggplot(data = mtp, mapping = aes(x = price)) + 
    geom_histogram(),

# Add boxplot
ggplot(data = mtp, mapping = aes(x = 1)) + 
    geom_boxplot(mapping = aes(y = price)) +
    coord_flip(), # use to have same x-axis on both graphs

# Set number of columns in grid.arrange
ncol = 1 )

Comments

  • Symmetric

Questions

  • Why units’s distribution is like this above.

Multi-variate non-graphical

Categorical

Company and Flavor

# Company and Flavor
mtp %>% 
  tabyl(flavor, company) %>% # creates table of counts
  adorn_totals(where = c("row", "col")) # Total margins
##          flavor   GM Kelloggs Post Total
##  CINNAMON TOAST 1834        0    0  1834
##           COCOA 1020      881    0  1901
##           FRUIT    0     2192    0  2192
##         REGULAR 1203     2840 4773  8816
##         TOASTED 3132     3975    0  7107
##           Total 7189     9888 4773 21850
mtp %>% 
  tabyl(flavor, company) %>% 
  adorn_totals(where = c("row", "col")) %>% 
  adorn_percentages(denominator = "all") %>% # creates proportions
  adorn_rounding(2) # round decimals
##          flavor   GM Kelloggs Post Total
##  CINNAMON TOAST 0.08     0.00 0.00  0.08
##           COCOA 0.05     0.04 0.00  0.09
##           FRUIT 0.00     0.10 0.00  0.10
##         REGULAR 0.06     0.13 0.22  0.40
##         TOASTED 0.14     0.18 0.00  0.33
##           Total 0.33     0.45 0.22  1.00

Comments

  • Post sells only “Regular” flavor but they remarkably got 22% share in the cereal market.
  • Only GM sells “Cinnamon flavor”, and they got number one share in “cocoa flavor”, but “cocoa” only has 9% of whole cereals…
  • Other than “cinnamon and cocoa”, GM is behind Kelloggs and Post


Package and brand(comapny)

# package and brand
mtp %>% 
  tabyl(brand, package) %>% # creates table of counts
  adorn_totals(where = c("row", "col")) # Total margins
##                          brand   BOX CUP Total
##         GENERAL MILLS CHEERIOS  1411  47  1458
##  GENERAL MILLS CINNAMON TST CR  1774  60  1834
##      GENERAL MILLS COCOA PUFFS  1020   0  1020
##              GENERAL MILLS KIX  1196   0  1196
##     GENERAL MILLS LUCKY CHARMS  1624  57  1681
##        KELLOGGS COCOA KRISPIES   877   4   881
##           KELLOGGS FROOT LOOPS  2099  93  2192
##        KELLOGGS FROSTED FLAKES  2158 137  2295
##   KELLOGGS FROSTED MINI WHEATS  1539  35  1574
##           KELLOGGS RAISIN BRAN  1266   0  1266
##         KELLOGGS RICE KRISPIES  1425  25  1450
##           KELLOGGS SMART START  1134   0  1134
##             KELLOGGS SPECIAL K  1305  86  1391
##                POST GRAPE NUTS  1289   0  1289
##            POST SHREDDED WHEAT  1189   0  1189
##                          Total 21306 544 21850
mtp %>% 
  tabyl(brand, package) %>% 
  adorn_totals(where = c("row", "col")) %>% 
  adorn_percentages(denominator = "all") %>% # creates proportions
  adorn_rounding(2) # round decimals
##                          brand  BOX  CUP Total
##         GENERAL MILLS CHEERIOS 0.06 0.00  0.07
##  GENERAL MILLS CINNAMON TST CR 0.08 0.00  0.08
##      GENERAL MILLS COCOA PUFFS 0.05 0.00  0.05
##              GENERAL MILLS KIX 0.05 0.00  0.05
##     GENERAL MILLS LUCKY CHARMS 0.07 0.00  0.08
##        KELLOGGS COCOA KRISPIES 0.04 0.00  0.04
##           KELLOGGS FROOT LOOPS 0.10 0.00  0.10
##        KELLOGGS FROSTED FLAKES 0.10 0.01  0.11
##   KELLOGGS FROSTED MINI WHEATS 0.07 0.00  0.07
##           KELLOGGS RAISIN BRAN 0.06 0.00  0.06
##         KELLOGGS RICE KRISPIES 0.07 0.00  0.07
##           KELLOGGS SMART START 0.05 0.00  0.05
##             KELLOGGS SPECIAL K 0.06 0.00  0.06
##                POST GRAPE NUTS 0.06 0.00  0.06
##            POST SHREDDED WHEAT 0.05 0.00  0.05
##                          Total 0.98 0.02  1.00

Package and company

# package and company
mtp %>% 
  tabyl(company, package) %>% # creates table of counts
  adorn_totals(where = c("row", "col")) # Total margins
##   company   BOX CUP Total
##        GM  7025 164  7189
##  Kelloggs  9645 243  9888
##      Post  4636 137  4773
##     Total 21306 544 21850
mtp %>% 
  tabyl(company, package) %>% 
  adorn_totals(where = c("row", "col")) %>% 
  adorn_percentages(denominator = "all") %>% # creates proportions
  adorn_rounding(2) # round decimals
##   company  BOX  CUP Total
##        GM 0.32 0.01  0.33
##  Kelloggs 0.44 0.01  0.45
##      Post 0.21 0.01  0.22
##     Total 0.98 0.02  1.00

Comments

  • Each company only sells 1% of cereal in a “CUP”
  • Post has more “CUPs” relative to its sales size, Kelloggs has least CUPs in comparison with its “BOX”


Company and Ad

# ad and company
mtp %>% 
  tabyl(company, ad) %>% # creates table of counts
  adorn_totals(where = c("row", "col")) # Total margins
##   company    A    B  NONE Total
##        GM  442  272  6475  7189
##  Kelloggs  762  540  8586  9888
##      Post  252  249  4272  4773
##     Total 1456 1061 19333 21850
mtp %>% 
  tabyl(company, ad) %>% 
  adorn_totals(where = c("row", "col")) %>% 
  adorn_percentages(denominator = "all") %>% # creates proportions
  adorn_rounding(2) # round decimals
##   company    A    B NONE Total
##        GM 0.02 0.01 0.30  0.33
##  Kelloggs 0.03 0.02 0.39  0.45
##      Post 0.01 0.01 0.20  0.22
##     Total 0.07 0.05 0.88  1.00

Comments

  • Kellogs put items on advertisement the most as whole and as proportion
    • Ratio for GM: 2:1:30 (9% has been advertised)
    • Ratio for Kelloggs: 3:2:39 (11% has been advertised)
    • Ratio for Post: 1:1:20 (9% has been advertised)


Company and Promo

# promo and company
mtp %>% 
  tabyl(company, promo) %>% # creates table of counts
  adorn_totals(where = c("row", "col")) # Total margins
##   company     0    1 Total
##        GM  5909 1280  7189
##  Kelloggs  7651 2237  9888
##      Post  3745 1028  4773
##     Total 17305 4545 21850
mtp %>% 
  tabyl(company, promo) %>% 
  adorn_totals(where = c("row", "col")) %>% 
  adorn_percentages(denominator = "all") %>% # creates proportions
  adorn_rounding(2) # round decimals
##   company    0    1 Total
##        GM 0.27 0.06  0.33
##  Kelloggs 0.35 0.10  0.45
##      Post 0.17 0.05  0.22
##     Total 0.79 0.21  1.00

Comments

  • Kellogs put items on promo the most as a whole, Kellogs and Post work on promotion on the same propotion and greater than GM.
    • Ratio for GM: 27:6 (aprox.18% has been promoted)
    • Ratio for Kelloggs: 7:2 (aprox.22% has been advertised)
    • Ratio for Post: 17:5 (aprox.22% has been advertised)


Quantitative

# Correlation table
mtp %>% 
  select_if(is.numeric) %>% # Use to select just the numeric variables
  cor() %>% 
  round(2) %>% 
  kable()
       week   units   volume   price

week 1.00 -0.03 -0.02 0.03 units -0.03 1.00 0.02 -0.19 volume -0.02 0.02 1.00 0.54 price 0.03 -0.19 0.54 1.00

Comments

  • Price and volume has correlation

Question

  • Somehow units and price does not have correlation. Need to figure it out.


Multi-variate graphical

Categorical

# More detail on promo,ad and company
grid.arrange(
  # ad and company
  mtp %>% 
    ggplot(mapping = aes(x = ad, fill = company)) + 
      geom_bar(position = "dodge"),
  # promo and company
  mtp %>% 
    ggplot(mapping = aes(x = promo , fill = company)) + 
      geom_bar(position = "dodge"),
  # flavor and company
  mtp %>% 
    ggplot(mapping = aes(x = flavor , fill = company)) + 
      geom_bar(position = "dodge"),
  # package and company
  mtp %>% 
    ggplot(mapping = aes(x = package , fill = company)) +
      geom_bar(position = "dodge"),
ncol = 1
)

# More detail on company and ad
grid.arrange(
  mtp %>% 
    ggplot(mapping = aes(x = company, fill = ad)) + 
      geom_bar(position = "dodge") +
      coord_flip(),
  mtp %>% 
    ggplot(mapping = aes(x = company, fill = ad)) +
      geom_bar(position = "fill") +
      coord_flip(),
ncol = 1
)

# More detail on brand and promo
grid.arrange(
    # Cluster of counts
  mtp %>% 
    ggplot(mapping = aes(x = company, fill = promo)) + 
      geom_bar(position = "dodge") +
      coord_flip(),
  # Proportion of counts
  mtp %>% 
    ggplot(mapping = aes(x = company, fill = promo)) +
      geom_bar(position = "fill") +
      coord_flip(),

ncol = 1
)

Comments

  • Confirm that GM has less advertisement and promo comparing with competitors
  • Kellogs has more advertisement and promotion than the other
mtp %>%
  group_by(ad, company) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(ad, company)) + 
  geom_tile(aes(fill = count))

Comments

  • Kelloggs has slightly ligher color in A and B for ad. However, three company has almost the same pattern in advertisement.

Quantitative

# price and units
mtp %>% 
  ggplot(mapping = aes(x = units, y = price)) +
  geom_point()

Comments

  • Price and units do not seem to have any relations.
# volume and price
mtp %>% 
  ggplot(mapping = aes(x = volume, y = price)) +
  geom_point()

Comments

  • Price and volume seem to have relations/patterns.

Question

  • What does seperate between volume less than 0.5 and above? Advertisement or Promo?
# volume and units
mtp %>% 
  ggplot(mapping = aes(x = volume, y = units)) +
  geom_point()

Comments

  • volume and units do not seem to have any relations.

Categorical and quantitative

mtp %>%
  select(week, units, brand, flavor, package, volume, price, promo, ad, company) %>%
  ggpairs()

Comments

  • Look more closely at:
    • price and volume
    • price and flavor
    • volume and company


# volume, price and company
mtp %>% 
    ggplot(mapping = aes(x = volume, y = price, color = company)) +
      geom_point() +
      geom_smooth(method = "lm", se = FALSE)

# volume, price and brand
mtp %>% 
    ggplot(mapping = aes(x = volume, y = price, color = brand)) +
      geom_point() +
      geom_smooth(method = "lm", se = FALSE)

# volume, price and flavor
mtp %>% 
    ggplot(mapping = aes(x = volume, y = price, color = flavor)) +
      geom_point() +
      geom_smooth(method = "lm", se = FALSE)

# volume, price and ad
mtp %>% 
    ggplot(mapping = aes(x = volume, y = price, color = ad)) +
      geom_point() +
      geom_smooth(method = "lm", se = FALSE)

Comments

  • Advertised has lower price on its similar volume
  • GM’s creal volume has more explanation to price than the other two companies’.
  • Kelloggs have smaller range in terms of volume
grid.arrange(
  # volume and flavor
  mtp %>% 
    ggplot(mapping = aes(x = flavor, y = volume)) +
      geom_boxplot(),
  # price nad flavor
  mtp %>% 
    ggplot(mapping = aes(x = flavor, y = price)) +
      geom_boxplot(),
    # price nad flavor
  mtp %>% 
    ggplot(mapping = aes(x = company, y = price)) +
      geom_boxplot(),
      # price nad flavor
  mtp %>% 
    ggplot(mapping = aes(x = company, y = units)) +
      geom_boxplot(),

ncol = 2
)

Comments

  • Advertised has lower price at sale on its similar volume

Questions

  • We need to see the relation between Units and ad/promo of company


Units and company/ad

mtp %>%
  group_by(ad, company) %>% 
  summarise(med_units = median(units)) %>% 
  ggplot(aes(ad, company)) +
  geom_tile(aes(fill = med_units))

Comments

  • GM’s cereal with mediam advertisement has more sales/units than other
  • However, because Kelloggs sells more, so we assume that Kelloggs has more power on its brand or there are more stores selling Kelloggs


Units and company/promo

mtp %>%
  group_by(promo, company) %>% 
  summarise(med_units = median(units)) %>% 
  ggplot(aes(promo, company)) +
  geom_tile(aes(fill = med_units))

Comments

  • GM’s cereal with promotion has the best sales/units than other


Questions

  • We need to see why GM has fewer units sold in total.


Detailed EDA

Do units differ by promo, ads, or company(producer)

# promo and units
mtp %>% 
  ggplot(mapping = aes(x = units)) +
  geom_histogram() +
  facet_wrap( ~ promo)
  • Propotionally cereal with promotion are sold more units, especially for higher units
  • Units without promotion have steep down but the units with promotion have less steepness
  • Promotion may have influenced the number of units sold
# ad and units
mtp %>% 
  ggplot(mapping = aes(x = units)) +
  geom_histogram() +
  facet_wrap( ~ ad)
  • Pattern holds?
  • Units without promotion have steep down but the units with promotion have less steepness
# company and units
mtp %>% 
  ggplot(mapping = aes(x = units)) +
  geom_histogram() +
  facet_wrap( ~ company)

Comments

  • GM’s cereal with promotion has the best sales/units than other
  • Pattern holds


mtp %>% 
    ggplot(mapping = aes(x = volume, y = units)) +
    geom_point() +
    geom_hline(yintercept = median(mtp$units), color="blue") +
    annotate(geom = "text", label = "units mean", x=3, y = 6, color="blue") +
    geom_smooth(mapping = aes(color = promo), method = "lm", se = FALSE)

Comments

  • if there is a promotion, there is more sales with volume


mtp %>% 
    ggplot(mapping = aes(x = volume, y = units)) +
    geom_point() +
    geom_hline(yintercept = median(mtp$units), color="blue") +
    annotate(geom = "text", label = "units mean", x=3, y = 6, color="blue") +
    geom_smooth(mapping = aes(color = ad), method = "lm", se = FALSE)

Comments

  • both Ad A and B does not have points for heavier cereal product. We may need more information/data around there if possible


mtp %>% 
    ggplot(mapping = aes(x = volume, y = units)) +
    geom_point() +
    geom_hline(yintercept = median(mtp$units), color="blue") +
    annotate(geom = "text", label = "units mean", x=3, y = 6, color="blue") +
    geom_smooth(mapping = aes(color = company), method = "lm", se = FALSE)

Comments

  • Company name does not have any relation with the units


Stats EDA

#promo
(t <- t.test(mtp$units[mtp$promo == '1'], mtp$units[mtp$promo == '0'], conf.level = 0.95))
## 
##  Welch Two Sample t-test
## 
## data:  mtp$units[mtp$promo == "1"] and mtp$units[mtp$promo == "0"]
## t = 27.59, df = 6398.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3.070437 3.540125
## sample estimates:
## mean of x mean of y 
## 11.196700  7.891419
#ad
(t <- t.test(mtp$units[mtp$ad == 'B'], mtp$units[mtp$ad == 'NONE'], conf.level = 0.95))
## 
##  Welch Two Sample t-test
## 
## data:  mtp$units[mtp$ad == "B"] and mtp$units[mtp$ad == "NONE"]
## t = 12.471, df = 1145.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.526639 3.470063
## sample estimates:
## mean of x mean of y 
## 11.268615  8.270263

Comments

  • reject the null hypothesis that the promotion does not impact the units/sales


z <- qnorm(0.975) #95 percent

mtp %>%
  group_by(promo) %>%
  summarise(mn = mean(units), sd = sd(units), n = n(), ci = z * sd/sqrt(n)) %>%
  ggplot(aes(x = promo, y = mn)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_errorbar(aes(ymin = mn - ci, ymax = mn + ci), width = 0.5, position = position_dodge(0.9)) +
  labs(title = "Units/sales difference by Promotion with error bar")

mtp %>%
  group_by(ad) %>%
  summarise(mn = mean(units), sd = sd(units), n = n(), ci = z * sd/sqrt(n)) %>%
  ggplot(aes(x = ad, y = mn)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_errorbar(aes(ymin = mn - ci, ymax = mn + ci), width = 0.5, position = position_dodge(0.9)) +
  labs(title = "Units/sales difference by advertisement type with error bar")

Do brand/company units/sales vary based on promotion?

chisq.test(table(mtp$brand, mtp$promo))
## 
##  Pearson's Chi-squared test
## 
## data:  table(mtp$brand, mtp$promo)
## X-squared = 180.16, df = 14, p-value < 2.2e-16

Comments

  • X-square is large,so reject the null
  • P-value is small, so reject the null
  • According to the test,we cannot see what is different and what is similar


How do company units/sales vary with promotion

C_P_n <- mtp %>%
  group_by(company, promo) %>%
  summarise(n = n())

C_P_n_ci <- multinomialCI(t(C_P_n[, 3]), 0.05)

C_P_tab <- mtp %>%
  group_by(company, promo) %>%
  summarise(prop = round(n()/sum(nrow(mtp)), 3))

C_P_tab$ci_l <- round(C_P_n_ci[,1], 3)
C_P_tab$ci_u <- round(C_P_n_ci[,2], 3)

htmlTable(C_P_tab)
company promo prop ci_l ci_u
1 GM 0 0.27 0.263 0.277
2 GM 1 0.059 0.052 0.066
3 Kelloggs 0 0.35 0.343 0.357
4 Kelloggs 1 0.102 0.095 0.109
5 Post 0 0.171 0.164 0.178
6 Post 1 0.047 0.04 0.054
# Graph of proportions with confidence intervals
C_P_tab %>% 
  ggplot(aes(x = promo, y = prop, fill = company)) +
  geom_bar(stat="identity", position = "dodge") +
  geom_text(aes(label = round(prop, 2)), vjust = -4, color = "black", # vjust moves lables above CI
            position = position_dodge(0.9), size = 4) +
  geom_errorbar(aes(ymin = ci_l, ymax = ci_u), 
                width = 0.4, position = position_dodge(0.9))

How do company units/sales vary with advertisement

C_A_n <- mtp %>%
  group_by(company, ad) %>%
  summarise(n = n())
C_A_n_ci <- multinomialCI(t(C_A_n[, 3]), 0.05)

C_A_tab <- mtp %>%
  group_by(company, ad) %>%
  summarise(prop = round(n()/sum(nrow(mtp)), 3))

C_A_tab$ci_l <- round(C_A_n_ci[,1], 3)
C_A_tab$ci_u <- round(C_A_n_ci[,2], 3)

htmlTable(C_A_tab)
company ad prop ci_l ci_u
1 GM A 0.02 0.013 0.027
2 GM B 0.012 0.005 0.02
3 GM NONE 0.296 0.289 0.304
4 Kelloggs A 0.035 0.028 0.042
5 Kelloggs B 0.025 0.018 0.032
6 Kelloggs NONE 0.393 0.386 0.4
7 Post A 0.012 0.004 0.019
8 Post B 0.011 0.004 0.019
9 Post NONE 0.196 0.188 0.203
# Graph of proportions with confidence intervals
C_A_tab %>% 
  ggplot(aes(x = ad, y = prop, fill = company)) +
  geom_bar(stat="identity", position = "dodge") +
  geom_text(aes(label = round(prop, 2)), vjust = -4, color = "black", # vjust moves lables above CI
            position = position_dodge(0.9), size = 4) +
  geom_errorbar(aes(ymin = ci_l, ymax = ci_u), 
                width = 0.4, position = position_dodge(0.9))

Comments

  • Kelloggs has the most reliability in units/sales in any situation regardless if it has promotion or advertisement


Significance of Correlation

mtp %>% 
  select_if(is.numeric) %>% 
  as.matrix() %>% 
  rcorr()
##         week units volume price
## week    1.00 -0.03  -0.02  0.03
## units  -0.03  1.00   0.02 -0.19
## volume -0.02  0.02   1.00  0.54
## price   0.03 -0.19   0.54  1.00
## 
## n= 21850 
## 
## 
## P
##        week   units  volume price 
## week          0.0000 0.0031 0.0000
## units  0.0000        0.0005 0.0000
## volume 0.0031 0.0005        0.0000
## price  0.0000 0.0000 0.0000

Multi-linear regression

# Set up mtp data set for regression
mtp_lm <- mtp %>% 
  mutate_if(is.integer, as.factor)

# Logit regression with general linear model (glm)
mod <- glm(units ~ company + price + volume + flavor + package + promo + ad, 
           family = binomial(link='logit'), 
           data = mtp_lm)

# Review output
summary(mod)
## 
## Call:
## glm(formula = units ~ company + price + volume + flavor + package + 
##     promo + ad, family = binomial(link = "logit"), data = mtp_lm)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0883   0.3130   0.3940   0.4767   1.6209  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      3.36980    0.19672  17.130  < 2e-16 ***
## companyKelloggs -0.55445    0.06866  -8.075 6.74e-16 ***
## companyPost     -0.75250    0.09056  -8.309  < 2e-16 ***
## price           -0.41682    0.03550 -11.740  < 2e-16 ***
## volume           1.13021    0.09726  11.621  < 2e-16 ***
## flavorCOCOA     -0.69062    0.12729  -5.426 5.78e-08 ***
## flavorFRUIT     -0.50820    0.13551  -3.750 0.000177 ***
## flavorREGULAR   -0.08374    0.12380  -0.676 0.498761    
## flavorTOASTED    0.35526    0.11885   2.989 0.002797 ** 
## packageCUP      -0.10245    0.18216  -0.562 0.573853    
## promo1           0.40156    0.07932   5.062 4.14e-07 ***
## adB              0.20834    0.18129   1.149 0.250464    
## adNONE          -0.19905    0.11495  -1.732 0.083329 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13608  on 21849  degrees of freedom
## Residual deviance: 12917  on 21837  degrees of freedom
## AIC: 12943
## 
## Number of Fisher Scoring iterations: 5

Comments

  • volume, flavorTOASTED, promo1 and adB are significant impact on units sold


par(mfrow = c(1, 1))
# units
plot(mtp$units, mod$residuals)

# company
plot(mtp$ad, mod$residuals)

plot(mtp$volume, mod$residuals)

# Flavor
plot(mtp$flavor, mod$residuals)

# ad
plot(mtp$ad, mod$residuals)

# promo
plot(mtp$promo, mod$residuals)
  • units
  • Ad A and B are correlative but A has one outlier residuals
  • units have residuals when unit is smaller
  • some observed variable is far away from the regression. Threfore, we may need more variables and sample to run more accurate


Visualization of Multiple Regression

coe <- summary(mod)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(mod)[-1, ])) # find and bind CI, remove Intercept 
## Waiting for profiling to be done...
# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI") 


# Order base on p-value
htmlTable(round(coe_CI[order(coe_CI$pval, decreasing = FALSE), ], 3))
estimate se t pval low_CI high_CI
price -0.417 0.036 -11.74 0 -0.486 -0.347
volume 1.13 0.097 11.621 0 0.94 1.322
companyPost -0.752 0.091 -8.309 0 -0.931 -0.576
companyKelloggs -0.554 0.069 -8.075 0 -0.69 -0.421
flavorCOCOA -0.691 0.127 -5.426 0 -0.943 -0.443
promo1 0.402 0.079 5.062 0 0.248 0.559
flavorFRUIT -0.508 0.136 -3.75 0 -0.776 -0.245
flavorTOASTED 0.355 0.119 2.989 0.003 0.119 0.585
adNONE -0.199 0.115 -1.732 0.083 -0.431 0.021
adB 0.208 0.181 1.149 0.25 -0.143 0.569
flavorREGULAR -0.084 0.124 -0.676 0.499 -0.329 0.157
packageCUP -0.102 0.182 -0.562 0.574 -0.452 0.264
# reorder by p-value
(g1 <- ggplot(coe_CI, aes(x = estimate, y = reorder(row.names(coe_CI),desc(pval)))) +
  geom_point(size = 3) +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient") +
  theme_bw() 
)
  • Volume has mroe the most significant impact on units/sales because its p-value is bigger than the alpha
  • Variables with p-value less than alpha could be influencial on units/sales?
# Use geom_segment to illustrate CI
(g2 <- g1 +
  geom_vline(xintercept = 0, color = "red")) +
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))), 
               xend = coe_CI$high_CI, color = "Blue") +
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(coe_CI$pval))), 
               xend = coe_CI$low_CI, color = "Blue") +
  xlab("Coefficient with Confidence Interval") 
  • 4 variables have positive p-value, which indicates positive outcome on units/sale.
    • volume
    • promo of 1
    • flavor toasted
    • ad of B
  • variables with negative coefficient have negative impact on units/sales.
    • price
    • ad none

Summary

  • Strategy:
    • Kelloggs has more advertisement, promotion (proportionally and quantatively), unit sales.
    • Post has less unit sales but they are concentrated on a few products/brand
  • What to recommend to GM
    • Promotion and advertisement are effective on unit sales

Create professional quality graph

Graph for promo finding:

z <- qnorm(0.975) #95 percent
levels(mtp$ad)[1] <- "Mediam Advertisement"
levels(mtp$ad)[2] <- "Small Advertisement"
levels(mtp$ad)[3] <- "No Advertisement"

levels(mtp$promo)[1] <- "Without Promotion"
levels(mtp$promo)[2] <- "With Promotion"

mtp %>%
  group_by(promo) %>%
  summarise(mn = mean(units), sd = sd(units), n = n(), ci = z * sd/sqrt(n)) %>%
  ggplot(aes(x = reorder(promo, mn) , y = mn)) +
  theme_classic() +
  geom_bar(stat = "identity", position = "dodge" ,fill = "light blue") +
  #geom_errorbar(aes(ymin = mn - ci, ymax = mn + ci), width = 0.5, position = position_dodge(0.9)) +
  labs(title = "Promotion has contributed more sales", subtitle = "Cereals with promotion have approximately 42% more sales",x = "", y ="Average weekly units sold", caption = "From Technical Appendix: Statistic EDA") +
  scale_y_continuous(breaks = seq(0, 16, 1)) +
  coord_flip() +
  ggsave(filename = "promo.png")

mtp %>%
  group_by(ad) %>%
  summarise(mn = mean(units), sd = sd(units), n = n(), ci = z * sd/sqrt(n)) %>%
  ggplot(aes(x = reorder(ad, mn) , y = mn)) +
  theme_classic() +
  geom_bar(stat = "identity", position = "dodge", fill = "light blue") +
  #geom_errorbar(aes(ymin = mn - ci, ymax = mn + ci), width = 0.5, position = position_dodge(0.9)) +
  labs(title = "Advertisement has an impact on our sales", subtitle = "Small advertisement have more impact on our sales than mediam advertisement", x = "", y = "Average weekly units sold", caption = "From Technical Appendix: Statistic EDA") +
  scale_y_continuous(breaks = seq(0, 16, 1)) +
  coord_flip() +
  ggsave(filename = "ad.png")

(g1 <- ggplot(coe_CI, aes(x = estimate, y = reorder(row.names(coe_CI),desc(pval)))) +
  geom_point(size = 3) +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient") +
  theme_bw() +
  geom_vline(xintercept = 0, color = "red") +
  labs(title = "What customers makes customer buy more and buy less", subtitle = "Volume, Promotion, Flavor and Small ad have positive influence on sales", caption = "From Technical Appendix: Statistic EDA", x ="Unit Sale Influence level", y ="") +
  annotate(geom = "text", label = "Improves Unit sales", x=1, y = 1, color="dark blue") +
  annotate(geom = "text", label = "Decrease Unit sales", x=-0.7, y = 1, color="dark red") +
  theme_classic() +
  ggsave(filename = "coefficient.png")

)
## Saving 7 x 5 in image